################################
## Script: 06_gpt_fine_tune.R
## Purpose: This code fine tunes GPT4o with training examples. 
## It creates fine tuned models that we use for LLM annotation. 
## Data In:
## data/fine_tune_examples.rds
## Data Out:
## 1) data/train_fine_tune_ids.rds
## article IDs used in same claim model fine tuning
## 2) data/train_fine_tune_ids_subject.rds
## article IDs used in same subject model fine tuning
## 3) full dataset of fine tuning data:
## a) same claim:
## data/fine_tune_same_claim_training_data.rds
## b) same subject:
## data/fine_tune_same_subject_training_data.rds
## 4) Two fine tuned models, saved on OpenAI servers:
## a) same subject model: ft:gpt-4o-2024-08-06:personal::Acor6lGL
## b) same claim model: ft:gpt-4o-2024-08-06:personal::AcobyL9Q
## 5) Test data set labels:
## a) same subject test data :data/fine_tune_test_same_subject.rds
## b) same claim test data: data/fine_tune_test_same_claim.rds

## Notes:

library(tidyverse)
library(jsonlite)
library(httr)
library(openai)

#########################################
#### Read Files ######################
#########################################


#OPENAI_API_KEY = '' ## include 

## fine tuning data
## these were created based on our
## hand labeled precision data 
## pulled together in code script 09_estimate_precision.R
## the logic is that our precision sampling process
## randomly sampled cases at the boundaries of different estimates
## 200 predicted positive cases from our zero shot gpt4o
## annotator (100 original, 100 strict)
## 90 from our ngram estimator
## 112 relatio
## 120 sbert
## 100 stm 
## the goal is to approximate random sampling by identifying cases
## at the margins of different classifiers 

## Load the hand-labeled example pairs and put both label columns into the
## upper-case YES / NO form.
examples <- readRDS("data/fine_tune_examples.rds") %>%
  mutate(
    same_subject = dplyr::recode(same_subject, `no` = "NO", `yes` = "YES"),
    final = dplyr::recode(final, `no` = "NO", `yes` = "YES")
  )


## removing missing data: each task keeps only the pairs that carry a
## label for that task
example_same_subject <- filter(examples, !is.na(same_subject))
example_same_claim <- filter(examples, !is.na(final))


######################################
### Create Training Data #############
######################################

## System prompts for the two classifiers. Each literal spans two source
## lines, so the line break is part of the string itself. The same objects
## are used both when building the training records and when querying the
## fine tuned models, so the text should not be edited in only one place.
prompt_compare_subject <- "You will be provided with the lists of the people, places, objects, and events discussed in two paragraphs. Based on these
lists, do the two paragraphs discuss the vast majority of the same people, places, objects, and events?"
prompt_compare_claim <- "You will be provided with the lists of descriptive, normative, conceptual, and causal claims discussed in two paragraphs. 
Based on these lists, do the two paragraphs discuss the vast majority of the same claims?" 



## split into training and validation
## balance by measure type, yes/no

## Build one stratum per (measure type, YES / NO subject label) combination
## and split the data into a list with one data frame per stratum.
example_same_subject$split <- paste(example_same_subject$measure,
                                    example_same_subject$same_subject,
                                    sep = "_")
example_same_subject <- split(example_same_subject,
                              example_same_subject$split)

## same subject sample:
## The seed is set once here. The same claim sampling later in the script
## draws from the same random number stream without re-seeding, so the
## number and order of the sample() calls below must not change.
set.seed(97405)
for (stratum in seq_along(example_same_subject)) {
  n_rows <- nrow(example_same_subject[[stratum]])

  ## 3/4 training 1/4 validation split
  n_train <- round(3/4 * n_rows)

  ## row positions (within the stratum) assigned to training
  train_rows <- sample(seq_len(n_rows),
                       n_train,
                       replace = FALSE)

  ## everything starts as "test"; the sampled rows are flipped to "train"
  example_same_subject[[stratum]]$train_test <- "test"
  example_same_subject[[stratum]]$train_test[train_rows] <- "train"
}
## stack the strata back into a single data frame
example_same_subject <- bind_rows(example_same_subject)


## Build the strata for the same claim split.
## NOTE(review): the strata are built from `same_subject`, not from `final`,
## although `final` is the label the same claim model is trained on. This
## looks like a carry-over from the same subject split. It is deliberately
## left unchanged: switching the column would change which pairs land in
## train / test, and the saved test labels
## (data/fine_tune_test_same_claim.rds) are tabulated row-by-row against the
## test rows produced by this split.
example_same_claim$split <- paste(example_same_claim$measure,
                                  example_same_claim$same_subject,
                                  sep = "_")
example_same_claim <- split(example_same_claim,
                            example_same_claim$split)

## same claim sample:
## No set.seed() here: the draws continue from the random number stream
## seeded before the same subject sampling.
for (stratum in seq_along(example_same_claim)) {
  n_rows <- nrow(example_same_claim[[stratum]])

  ## 3/4 training 1/4 validation split
  n_train <- round(3/4 * n_rows)
  train_rows <- sample(seq_len(n_rows),
                       n_train,
                       replace = FALSE)

  ## everything starts as "test"; the sampled rows are flipped to "train"
  example_same_claim[[stratum]]$train_test <- "test"
  example_same_claim[[stratum]]$train_test[train_rows] <- "train"
}
## stack the strata back into a single data frame
example_same_claim <- bind_rows(example_same_claim)


## saving for record - all unique article IDs used in same 
## claim training 
## an article counts as "used" if it is the ego or the alter of any
## training pair
train_ids <- with(
  example_same_claim,
  unique(c(ego_id[train_test == "train"],
           alter_id[train_test == "train"]))
)
#saveRDS(train_ids, "data/train_fine_tune_ids.rds")

## subject training: same record for the same subject model
subject_ids <- with(
  example_same_subject,
  unique(c(ego_id[train_test == "train"],
           alter_id[train_test == "train"]))
)
#saveRDS(subject_ids, "data/train_fine_tune_ids_subject.rds")


## save match IDS (full data sets including the train / test assignment)
#saveRDS(example_same_claim, "data/fine_tune_same_claim_training_data.rds")
#saveRDS(example_same_subject, "data/fine_tune_same_subject_training_data.rds")



##################################
### Format Training Data #########
##################################

## keep only the rows assigned to training
example_same_subject_train <- example_same_subject %>%
  filter(train_test == "train")
example_same_claim_train <- example_same_claim %>%
  filter(train_test == "train")


## format training data into
## message format used by OpenAI GPT models:
## one record per pair, holding a system message (task prompt), a user
## message (the two paragraph lists plus the answer instruction) and an
## assistant message (the hand label).
## The records are built with lapply() over seq_len(nrow()) so that an empty
## training set yields an empty list instead of iterating over c(1, 0).

training_subject <- lapply(
  seq_len(nrow(example_same_subject_train)),
  function(i) {
    list(
      messages = list(
        list(role = "system", content = prompt_compare_subject),
        list(role = "user",
             content = paste0("\n **Paragraph 1**: ", example_same_subject_train$ego_subject[i],
                              "\n **Paragraph 2**: ", example_same_subject_train$alter_subject[i],
                              "\n", "**Your label (Respond only with 'YES' or 'NO')**:")),
        list(role = "assistant", content = example_same_subject_train$same_subject[i])
      )
    )
  }
)

training_claim <- lapply(
  seq_len(nrow(example_same_claim_train)),
  function(i) {
    list(
      messages = list(
        list(role = "system", content = prompt_compare_claim),
        list(role = "user",
             content = paste0("\n **Paragraph 1**: ", example_same_claim_train$ego_claim[i],
                              "\n **Paragraph 2**: ", example_same_claim_train$alter_claim[i],
                              "\n", "**Your label (Respond only with 'YES' or 'NO')**:")),
        list(role = "assistant", content = example_same_claim_train$final[i])
      )
    )
  }
)


# Format into JSONL
# Convert the dataset to JSONL format
## One JSON string per training record. vapply() pins the result to a
## character vector (sapply() would return an empty list for empty input);
## as.character() drops the "json" class that toJSON() attaches.
jsonl_data_subject <- vapply(
  training_subject,
  function(record) as.character(toJSON(record, auto_unbox = TRUE)),
  character(1)
)
jsonl_data_claim <- vapply(
  training_claim,
  function(record) as.character(toJSON(record, auto_unbox = TRUE)),
  character(1)
)


# Write each JSON object to a new line in a .jsonl file
# (the writeLines() calls are commented out, so re-running the script does
# not write the training files; un-comment them to regenerate the files)
fileConn <- file("data/training_gpt4o/training_data_claim.jsonl")
#writeLines(jsonl_data_claim, fileConn)
close(fileConn)

fileConn <- file("data/training_gpt4o/training_data_subject.jsonl")
#writeLines(jsonl_data_subject, fileConn)
close(fileConn)


##############################
### Same Claim Training ###
##############################

# Define the API key and endpoint
## The requests below authenticate with `OPENAI_API_KEY`. If the script has
## not defined it, fall back to the environment variable of the same name,
## and stop early with a clear message if no key is available.
if (!exists("OPENAI_API_KEY")) {
  OPENAI_API_KEY <- Sys.getenv("OPENAI_API_KEY")
}
if (!isTRUE(nzchar(OPENAI_API_KEY))) {
  stop("OPENAI_API_KEY is not set; define it or export it before running.",
       call. = FALSE)
}

upload_endpoint <- "https://api.openai.com/v1/files"

# Upload the training file
response <- POST(
  url = upload_endpoint,
  add_headers(Authorization = paste("Bearer", OPENAI_API_KEY)),
  body = list(
    file = upload_file("data/training_gpt4o/training_data_claim.jsonl"),
    purpose = "fine-tune"
  )
)
## fail here rather than submitting a job without a training file id
stop_for_status(response, task = "upload the same claim training file")

# Check the response
claim_upload_result <- content(response, as = "parsed")
claim_upload_result

fine_tune_endpoint <- "https://api.openai.com/v1/fine_tuning/jobs"

## NOTE: this request submits a new fine tuning job
response_fine_tune <- POST(
  url = fine_tune_endpoint,
  add_headers(Authorization = paste("Bearer", OPENAI_API_KEY)),
  body = list(
    training_file = claim_upload_result$id, # id of the file uploaded above
    model = "gpt-4o-2024-08-06" # Choose the model to fine-tune
  ),
  encode = "json"
)
stop_for_status(response_fine_tune, task = "create the same claim fine tuning job")

# Check the response
claim_job <- content(response_fine_tune, as = "parsed")
claim_job


## the job id identifies the job in the status endpoint
fine_tune_job_id <- claim_job$id
status_endpoint <- paste0(fine_tune_endpoint, "/", fine_tune_job_id)

response_status <- GET(
  url = status_endpoint,
  add_headers(Authorization = paste("Bearer", OPENAI_API_KEY))
)
stop_for_status(response_status, task = "query the same claim fine tuning job")

# Check the status
content(response_status, as = "parsed")

## job completed, 
## model: ft:gpt-4o-2024-08-06:personal::AcobyL9Q

##############################
#### Same Claim Validation ###
##############################

## Here we use our fine tuned same claim
## model to see how it performs in the test set 
## sampled above

## Note: we don't report these estimates in 
## paper as the training data is not a random sample
## we used the test set to see if this was a reasonable
## approach before undertaking our expensive hand labeling test. 
## We provide an intermediary file below so that replicators
## don't need to re-run these examples. 

## held-out pairs for the same claim model
example_same_claim_test <- example_same_claim %>%
  filter(train_test == "test")


## Query the fine tuned same claim model once per test pair.
## The loop covers every test row starting at row 1 (it previously started
## at row 100, which skipped the first 99 pairs and left `labels` shorter
## than, and misaligned with, the test set). seq_len() also handles a test
## set with fewer rows safely.
n_claim_test <- nrow(example_same_claim_test)
validation_out_claim <- vector("list", n_claim_test)
for (i in seq_len(n_claim_test)) {
  ## same user message layout as in the training records
  prompt <- paste0("\n **Paragraph 1**: ", example_same_claim_test$ego_claim[i],
                   "\n **Paragraph 2**: ", example_same_claim_test$alter_claim[i],
                   "\n", "**Your label (Respond only with 'YES' or 'NO')**:")
  validation_out_claim[[i]] <- create_chat_completion(model = "ft:gpt-4o-2024-08-06:personal::AcobyL9Q",
                                                      messages = list(list(role = "system",
                                                                           content = prompt_compare_claim),
                                                                      list(role = "user",
                                                                           content = prompt)),
                                                      temperature = 0)
  ## progress indicator
  print(i)
}

## pull the returned message text out of every response, in test row order
labels <- lapply(validation_out_claim,
                 function(x){x$choices$message.content})
labels <- unlist(labels)


## saving output 
#saveRDS(labels, "data/fine_tune_test_same_claim.rds")
## use the stored model labels (replaces whatever the loop produced)
labels <- readRDS("data/fine_tune_test_same_claim.rds")

## cross-tabulate model label against hand label
table(labels, example_same_claim_test$final)
33 / (33 + 4) ## 89.2% precision 


## removing articles that had ego or alter in training
## by construction no training data pairs are in the test dataset
## but some individual articles may be included as an ego or 
## alter article in a different pair. We do a second precision
## test on the pairs which had no individual articles in the training 
## set (i.e. no pairs where one of the articles was an ego or alter
## article in a separate pair in the training set)
## Note: we do this test in all our analyses 

## every article that appears on either side of a training pair
claim_train_article_ids <- c(example_same_claim_train$ego_id,
                             example_same_claim_train$alter_id)
## test rows where the ego or the alter article was seen in training
toremove <- which(
  example_same_claim_test$ego_id %in% claim_train_article_ids |
    example_same_claim_test$alter_id %in% claim_train_article_ids
)
'%ni%' <- Negate("%in%")
## logical mask over test rows: TRUE when the row is not in `toremove`
toinclude <- seq_len(nrow(example_same_claim_test)) %ni% toremove
table(labels[toinclude], example_same_claim_test$final[toinclude])
21 / (21 + 0) ## 100% precision 


##############################
### Same Subject Training ###
##############################

# Upload the training file
## The requests below authenticate with `OPENAI_API_KEY`. If the script has
## not defined it, fall back to the environment variable of the same name,
## and stop early with a clear message if no key is available.
if (!exists("OPENAI_API_KEY")) {
  OPENAI_API_KEY <- Sys.getenv("OPENAI_API_KEY")
}
if (!isTRUE(nzchar(OPENAI_API_KEY))) {
  stop("OPENAI_API_KEY is not set; define it or export it before running.",
       call. = FALSE)
}

# Upload the training file
response_subject <- POST(
  url = upload_endpoint,
  add_headers(Authorization = paste("Bearer", OPENAI_API_KEY)),
  body = list(
    file = upload_file("data/training_gpt4o/training_data_subject.jsonl"),
    purpose = "fine-tune"
  )
)
## fail here rather than submitting a job without a training file id
stop_for_status(response_subject, task = "upload the same subject training file")

# Check the response
subject_upload_result <- content(response_subject, as = "parsed")
subject_upload_result

fine_tune_endpoint <- "https://api.openai.com/v1/fine_tuning/jobs"

## NOTE: this request submits a new fine tuning job
response_fine_tune_subject <- POST(
  url = fine_tune_endpoint,
  add_headers(Authorization = paste("Bearer", OPENAI_API_KEY)),
  body = list(
    training_file = subject_upload_result$id, # id of the file uploaded above
    model = "gpt-4o-2024-08-06" # Choose the model to fine-tune
  ),
  encode = "json"
)
stop_for_status(response_fine_tune_subject,
                task = "create the same subject fine tuning job")

# Check the response
subject_job <- content(response_fine_tune_subject, as = "parsed")
subject_job

## the job id identifies the job in the status endpoint
fine_tune_job_id_subject <- subject_job$id
status_endpoint_subject <- paste0(fine_tune_endpoint, "/", fine_tune_job_id_subject)

response_status <- GET(
  url = status_endpoint_subject,
  add_headers(Authorization = paste("Bearer", OPENAI_API_KEY))
)
stop_for_status(response_status, task = "query the same subject fine tuning job")

# Check the status
content(response_status, as = "parsed")

## same subject model: ft:gpt-4o-2024-08-06:personal::Acor6lGL

##############################
#### Same Subject Validation ###
##############################

## held-out pairs for the same subject model
example_same_subject_test <- example_same_subject %>%
  filter(train_test == "test")

## We provide an intermediary file below so that replicators
## don't need to re-run these examples. 

## Query the fine tuned same subject model once per test pair.
## The result list is preallocated and the loop uses seq_len(), so an empty
## test set results in zero requests instead of iterating over c(1, 0).
n_subject_test <- nrow(example_same_subject_test)
validation_out_subject <- vector("list", n_subject_test)
for (i in seq_len(n_subject_test)) {
  ## same user message layout as in the training records
  prompt <- paste0("\n **Paragraph 1**: ", example_same_subject_test$ego_subject[i],
                   "\n **Paragraph 2**: ", example_same_subject_test$alter_subject[i],
                   "\n", "**Your label (Respond only with 'YES' or 'NO')**:")

  validation_out_subject[[i]] <- create_chat_completion(model = "ft:gpt-4o-2024-08-06:personal::Acor6lGL",
                                                        messages = list(list(role = "system",
                                                                             content = prompt_compare_subject),
                                                                        list(role = "user",
                                                                             content = prompt)),
                                                        temperature = 0)
  ## progress indicator
  print(i)
}

## pull the returned message text out of every response, in test row order
labels <- lapply(validation_out_subject,
                 function(x){x$choices$message.content})
labels <- unlist(labels)



## saving output 
#saveRDS(labels, "data/fine_tune_test_same_subject.rds")
## use the stored model labels (replaces whatever the loop produced)
labels <- readRDS("data/fine_tune_test_same_subject.rds")


## cross-tabulate model label against hand label
table(labels, example_same_subject_test$same_subject)
37 / (37 + 4) ## 90.2% precision

## removing articles that had ego or alter in training
## by construction no training data pairs are in the test dataset
## but some individual articles may be included as an ego or 
## alter article in a different pair. We do a second precision
## test on the pairs which had no individual articles in the training 
## set (i.e. no pairs where one of the articles was an ego or alter
## article in a separate pair in the training set)

## every article that appears on either side of a training pair
subject_train_article_ids <- c(example_same_subject_train$ego_id,
                               example_same_subject_train$alter_id)
## test rows where the ego or the alter article was seen in training
toremove <- which(
  example_same_subject_test$ego_id %in% subject_train_article_ids |
    example_same_subject_test$alter_id %in% subject_train_article_ids
)
'%ni%' <- Negate("%in%")
## logical mask over test rows: TRUE when the row is not in `toremove`
toinclude <- seq_len(nrow(example_same_subject_test)) %ni% toremove
table(labels[toinclude], example_same_subject_test$same_subject[toinclude])
16 / (16+2) ## 88.9% precision 
